package cre.sample.test.tika;

/**
 * Created by CreGu on 2016/7/25.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.xml.sax.helpers.DefaultHandler;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

/**
 * Static helpers that extract text from files with Apache Tika and index / search
 * that text with Lucene, using {@link IKAnalyzer} for tokenization.
 *
 * <p>The index lives in the {@code ./Repos} directory (relative to the working
 * directory). One {@link IndexWriter} and one {@link IndexSearcher} are created
 * lazily and shared by all callers.
 *
 * @author Heweipo
 *
 */
public class TikaUtils {

    /** Location of the Lucene index, relative to the process working directory. */
    private static final String REPOS_DIR = "./Repos";

    // volatile so that the lazily created instances are safely published to other threads
    private static volatile org.apache.lucene.index.IndexWriter indexWriter;

    private static volatile IndexSearcher indexSearcher;

    private static final Analyzer analyzer = new IKAnalyzer();

    private static final Object lock = new Object();

    /** Field holding the extracted text; built from a Reader, so it is indexed but not stored. */
    public final static String ARTICLE_CONTENT = "fileContent";

    /** Field holding the file name (stored and analyzed). */
    public final static String ARTICLE_NAME = "fileName";

    /** Field holding the absolute file path (stored, not indexed). */
    public final static String ARTICLE_PATH = "filePath";

    /** Stored copy of the text used for highlighting; for testing, set ARTICLE_FRAGMENT_USE = false if not needed. */
    public final static String ARTICLE_FRAGMENT = "fragment";

    /** When true, the full text is stored at index time and a highlighted fragment is produced at search time. */
    public static boolean ARTICLE_FRAGMENT_USE = true;

    protected static final org.slf4j.Logger log = org.slf4j.LoggerFactory.getLogger(TikaUtils.class);


    /**
     * Indexes a file, or recursively every file below a directory, without committing.
     *
     * <p>Directory traversal is best effort: a failure on one child does not stop the
     * others and does not change the result for the directory itself.
     *
     * @param indexWriter writer the documents are added to
     * @param file        file or directory to index
     * @return {@code false} if {@code file} is null, missing, unreadable, cannot be
     *         listed, or (for a single file) could not be indexed; {@code true} otherwise
     */
    private static boolean writeRepos(IndexWriter indexWriter , File file){
        if(file == null || !file.exists() || !file.canRead()) return false;
        if(!file.isDirectory()){
            return writeFile(indexWriter, file);
        }
        // listFiles() returns null on an I/O error; iterating it blindly would throw NPE
        File[] children = file.listFiles();
        if(children == null){
            log.warn("无法读取目录内容："+file.getAbsolutePath());
            return false;
        }
        for(File child : children){
            writeRepos(indexWriter, child);
        }
        return true;
    }

    /**
     * Builds one Lucene document for a regular file and adds it to the writer.
     * For {@code .zip} files every entry contributes one content field.
     *
     * @return {@code true} if the document was added, {@code false} on any failure
     */
    private static boolean writeFile(IndexWriter indexWriter , File file){
        ZipFile zf = null;
        // every content reader is remembered so that it can be released even when
        // indexing fails before Lucene had a chance to consume it
        List<java.io.Reader> readers = new ArrayList<java.io.Reader>();
        try{
            Document doc = new Document();
            Tika tika = new Tika();
            if(file.getName().endsWith(".zip")){
                zf = new ZipFile(file);
                Enumeration<? extends ZipEntry> entries = zf.entries();
                while(entries.hasMoreElements()){
                    ZipEntry entry = entries.nextElement();
                    // fresh Metadata per parse: the parser writes detection results into it,
                    // which must not leak into the next entry
                    java.io.Reader content = tika.parse(zf.getInputStream(entry), newMetadata());
                    readers.add(content);
                    doc.add(new Field(ARTICLE_CONTENT, content));
                }
            }else{
                java.io.Reader content = tika.parse(new FileInputStream(file), newMetadata());
                readers.add(content);
                doc.add(new Field(ARTICLE_CONTENT, content));
            }
            doc.add(new Field(ARTICLE_NAME,file.getName(),Store.YES,Index.ANALYZED));
            doc.add(new Field(ARTICLE_PATH,file.getAbsolutePath(),Store.YES,Index.NO));
            if(ARTICLE_FRAGMENT_USE){
                String text = tika.parseToString(new FileInputStream(file), newMetadata());
                doc.add(new Field(ARTICLE_FRAGMENT,text,Store.YES,Index.NO));
            }
            indexWriter.addDocument(doc);

            log.info("正在加载文件索引："+file.getName()+" "+file.getAbsolutePath());
            return true;
        }catch(Exception e){
            log.error("索引创建失败："+e.getMessage(), e);
            return false;
        }finally{
            for(java.io.Reader reader : readers){
                closeQuietly(reader);
            }
            if(zf != null){
                try {
                    zf.close();
                } catch (IOException e) {
                    log.error("",e);
                }
            }
        }
    }

    /** Creates the Metadata used for each parse, pre-set with a utf-8 content encoding. */
    private static Metadata newMetadata(){
        Metadata metadata = new Metadata();
        metadata.add(Metadata.CONTENT_ENCODING, "utf-8");
        return metadata;
    }

    /** Closes a resource, logging (not propagating) any failure; null is ignored. */
    private static void closeQuietly(java.io.Closeable resource){
        if(resource == null) return;
        try {
            resource.close();
        } catch (IOException e) {
            log.error("",e);
        }
    }

    /**
     * Indexes a file or directory tree and commits the index.
     *
     * @param file file or directory to index
     * @return {@code true} only if indexing and the commit both succeeded;
     *         {@code false} if the writer is unavailable, the file could not be
     *         indexed, or the commit failed
     */
    public static boolean writeRepos(File file){
        IndexWriter writer = getIndexWriter();
        if(writer == null){
            log.error("索引写入器不可用，无法创建索引");
            return false;
        }
        if(!writeRepos(writer , file)) return false;
        try {
            // commit once at the end so the whole tree becomes visible together
            writer.commit();
            log.info("文件索引创建成功："+file.getName()+" "+file.getAbsolutePath());
        } catch (Exception e) {
            log.error("文件索引创建之后提交失败："+e.getMessage(), e);
            return false;
        }
        return true;
    }

    /**
     * Searches the index.
     *
     * @param fields fields to search; when null or empty, {@link #ARTICLE_CONTENT}
     *               and {@link #ARTICLE_NAME} are searched
     * @param value  query string, parsed with the Lucene query syntax
     * @param numb   maximum number of hits to return
     * @return one map per hit (see the {@code ARTICLE_*} keys), or {@code null} if
     *         the searcher is unavailable or the search failed
     */
    public static List<Map<String,String>> readRepos(String[] fields , String value , int numb){
        if(fields == null || fields.length == 0){
            fields = new String[]{TikaUtils.ARTICLE_CONTENT,TikaUtils.ARTICLE_NAME};
        }
        List<Map<String,String>> list = new ArrayList<Map<String,String>>();
        try {
            Query query = new MultiFieldQueryParser(Version.LUCENE_30, fields, analyzer).parse(value);

            // Resolve the searcher ONCE: getIndexSearcher() may reopen the reader, and doc ids
            // returned by one reader must not be looked up in another.
            IndexSearcher searcher = getIndexSearcher();
            if(searcher == null){
                log.error("搜索索引时异常：IndexSearcher不可用");
                return null;
            }
            ScoreDoc[] scoreDocs = searcher.search(query, numb).scoreDocs;

            // highlight style; the flag is read once so both steps see the same value
            Highlighter lighter = null;
            if(ARTICLE_FRAGMENT_USE){
                Formatter format = new SimpleHTMLFormatter("<font color='red'>","</font>");
                Scorer scorer = new QueryScorer(query);
                Fragmenter fragmenter = new SimpleFragmenter(100);
                lighter = new Highlighter(format,scorer);
                lighter.setTextFragmenter(fragmenter);
            }

            // scoreDocs never holds more than min(numb, totalHits) entries
            for(ScoreDoc hit : scoreDocs){
                list.add(document2map(searcher.doc(hit.doc),lighter));
            }
        } catch (Exception e) {
            log.error("搜索索引时异常："+e.getMessage(), e);
            return null;
        }
        return list;
    }

    /**
     * Converts a hit into a map keyed by the {@code ARTICLE_*} constants.
     *
     * @param doc     the Lucene document; null yields an empty map
     * @param lighter highlighter used to build the fragment, or null to skip it
     * @return map with name, path and content (content is presumably always null,
     *         since that field is not stored), plus the highlighted fragment when
     *         one could be produced
     */
    private static Map<String,String> document2map(Document doc , Highlighter lighter){
        Map<String,String> map = new HashMap<String,String>();
        if(doc == null) return map;
        map.put(ARTICLE_NAME, doc.get(ARTICLE_NAME));
        map.put(ARTICLE_PATH, doc.get(ARTICLE_PATH));
        map.put(ARTICLE_CONTENT, doc.get(ARTICLE_CONTENT));

        // Documents indexed while ARTICLE_FRAGMENT_USE was false carry no stored text;
        // skipping them keeps one such hit from failing the whole search.
        String stored = doc.get(ARTICLE_FRAGMENT);
        if(lighter == null || stored == null) return map;
        try {
            String fragment = lighter.getBestFragment(analyzer,ARTICLE_FRAGMENT,stored);
            if(fragment != null){
                // strip tabs and line breaks so the fragment is a single line
                map.put(ARTICLE_FRAGMENT, fragment.trim().replaceAll("[\\t\\n\\r]", ""));
            }
        } catch (IOException e) {
            log.error("document转换Map异常："+e.getMessage(), e);
        } catch (InvalidTokenOffsetsException e) {
            log.error("document转换Map异常："+e.getMessage(), e);
        }
        return map;
    }

    /**
     * Returns the shared searcher, creating it on first use and reopening its
     * reader when the index has changed since it was opened.
     *
     * @return the searcher, or {@code null} if the index could not be opened
     */
    public static IndexSearcher getIndexSearcher(){
        IndexSearcher searcher = indexSearcher;
        if(searcher == null){
            synchronized(lock){
                // re-check under the lock: another thread may have created it meanwhile
                searcher = indexSearcher;
                if(searcher == null){
                    try {
                        Directory directory = FSDirectory.open(new File(REPOS_DIR));
                        searcher = new IndexSearcher(directory);
                        indexSearcher = searcher;
                    }catch(IOException e){
                        log.error("获取IndexSearcher异常："+e.getMessage(), e);
                        return null;
                    }
                }
            }
            return searcher;
        }
        // pick up changes committed since the reader was opened
        try {
            IndexReader indexReader = searcher.getIndexReader();
            if(!indexReader.isCurrent()){
                // NOTE(review): the previous reader is not closed here because other threads
                // may still be searching with it; its resources are held until GC.
                searcher = new IndexSearcher(indexReader.reopen(true));
                indexSearcher = searcher;
            }
        } catch (IOException e) {
            // CorruptIndexException is an IOException, so it is covered here as well
            log.error("获取IndexSearcher异常："+e.getMessage(), e);
        }
        return searcher;
    }


    /**
     * Returns the shared writer, creating it (and the index directory) on first use.
     *
     * @return the writer, or {@code null} if it could not be created
     */
    public static IndexWriter getIndexWriter(){
        IndexWriter writer = indexWriter;
        if(writer == null){
            synchronized (lock) {
                // re-check under the lock so only one writer is ever opened on the directory
                writer = indexWriter;
                if(writer == null){
                    File file = new File(REPOS_DIR);
                    if(!file.exists()) file.mkdir();
                    try {
                        FSDirectory directory = FSDirectory.open(file);
                        writer = new IndexWriter(directory,analyzer,MaxFieldLength.UNLIMITED);
                        indexWriter = writer;
                    } catch (IOException e) {
                        log.error("获取IndexWriter异常："+e.getMessage(), e);
                    }
                }
            }
        }
        return writer;
    }



    /**
     * Returns the analyzer shared by indexing and searching.
     */
    public static Analyzer getAnalyzer(){
        return analyzer;
    }

    /**
     * Extracts the body text of a file with Tika's auto-detecting parser and prints
     * every metadata entry to standard output.
     *
     * @param file file to parse
     * @return the extracted text, or {@code null} if the file could not be read or parsed
     */
    public static String fileToTxt(File file) {
        org.apache.tika.parser.Parser parser = new AutoDetectParser();
        InputStream inputStream = null;
        try {
            inputStream = new FileInputStream(file);
            DefaultHandler handler = new BodyContentHandler();
            Metadata metadata = new Metadata();
            ParseContext parseContext = new ParseContext();
            // registering the parser lets it be used for embedded documents as well
            parseContext.set(org.apache.tika.parser.Parser.class, parser);
            parser.parse(inputStream,handler,metadata,parseContext);
            for (String string : metadata.names()) {
                System.out.println(string+":"+metadata.get(string));
            }
            return handler.toString();
        } catch (Exception e) {
            log.error("文件转换文本失败："+e.getMessage(), e);
        } finally {
            // close in finally so the stream is released when parsing throws
            closeQuietly(inputStream);
        }
        return null;
    }

    /**
     * Manual smoke test: extracts a document's text, indexes it, and runs a sample query.
     *
     * @param args optional; {@code args[0]} is the path of the document to use
     *             (defaults to {@code D:/tikatest/test.doc})
     */
    public static void main(String[] args) throws Exception {
        // create the Tika facade
        Tika tika = new Tika();
        // extract the file content; previously tried with pdf, doc, docx, txt, ppt and wps samples
        String filePath = (args != null && args.length > 0) ? args[0] : "D:/tikatest/test.doc";
        File file = new File(filePath);
        String text = tika.parseToString(file);
        System.out.println(text);

        System.out.println(fileToTxt(file));

        TikaUtils.writeRepos(file);
        List<Map<String, String>> map = TikaUtils.
                readRepos(null, "接口", 5);

        System.out.println(map);
    }


}
